# Provides a session log for the packages used in this .rmd
library(sessioninfo)
session_info(pkgs = "!attached", to_file = "03a_session_log.txt")
In this script, we go through all the pre-registered proposed analyses. As a reminder, the main research questions where as follows:
Here we present the main “sample theory based” analyses (also known as frequentist), separately on the North American and UK samples in parallel to answer our first two research questions, then together to answer our third research question. In the next section (03b) provide additional Bayesian statistics where a null effect was found, as specified in the pre-registration.
# Library imports, general settings ==============
library(tidyverse)
library(egg)
library(knitr)
library(lme4)
library(lmerTest)
library(simr)
# As in our discussion with Mike, we will use lmerTest for calculating p values
library(lattice)
library(effects)
library(sjPlot)
library(robustlmm)
library(car)
# Load model comparison functions
source("helper/lrtests.R")
# Deal with package priority issues
select <- dplyr::select
theme_set(theme_bw(base_size = 10))
options("future" = T)
# knitr::opts_chunk$set(cache = TRUE)
print(sessionInfo()) # listing all info about R and packages info
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] furrr_0.3.0 future.apply_1.9.0 future_1.27.0
## [4] bridgesampling_1.1-2 rstan_2.21.5 StanHeaders_2.21.0-7
## [7] brms_2.17.0 Rcpp_1.0.9 car_3.1-0
## [10] robustlmm_3.0-1 sjPlot_2.8.10 effects_4.2-1
## [13] carData_3.0-5 lattice_0.20-45 simr_1.0.6
## [16] lmerTest_3.1-3 lme4_1.1-30 Matrix_1.4-1
## [19] knitr_1.39 egg_0.4.5 gridExtra_2.3
## [22] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.9
## [25] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0
## [28] tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2
## [31] sessioninfo_1.2.2
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.1.2 htmlwidgets_1.5.4
## [4] grid_4.2.1 munsell_0.5.0 codetools_0.2-18
## [7] effectsize_0.7.0 DT_0.23 miniUI_0.1.1.1
## [10] withr_2.5.0 Brobdingnag_1.2-7 colorspace_2.0-3
## [13] rstudioapi_0.13 stats4_4.2.1 robustbase_0.95-0
## [16] bayesplot_1.9.0 listenv_0.8.0 emmeans_1.7.5
## [19] RLRsim_3.1-8 farver_2.1.1 datawizard_0.4.1
## [22] coda_0.19-4 parallelly_1.32.1 vctrs_0.4.1
## [25] generics_0.1.3 xfun_0.31 R6_2.5.1
## [28] markdown_1.1 cachem_1.0.6 assertthat_0.2.1
## [31] promises_1.2.0.1 scales_1.2.0 nnet_7.3-17
## [34] googlesheets4_1.0.0 gtable_0.3.0 globals_0.15.1
## [37] processx_3.7.0 rlang_1.0.4 splines_4.2.1
## [40] gargle_1.2.0 broom_1.0.0 checkmate_2.1.0
## [43] inline_0.3.19 yaml_2.3.5 reshape2_1.4.4
## [46] abind_1.4-5 modelr_0.1.8 threejs_0.3.3
## [49] crosstalk_1.2.0 backports_1.4.1 httpuv_1.6.5
## [52] tensorA_0.36.2 tools_4.2.1 ellipsis_0.3.2
## [55] jquerylib_0.1.4 posterior_1.2.2 ggridges_0.5.3
## [58] plyr_1.8.7 base64enc_0.1-3 ps_1.7.1
## [61] prettyunits_1.1.1 zoo_1.8-10 haven_2.5.0
## [64] fs_1.5.2 survey_4.1-1 magrittr_2.0.3
## [67] colourpicker_1.1.1 reprex_2.0.1 googledrive_2.0.0
## [70] mvtnorm_1.1-3 sjmisc_2.8.9 matrixStats_0.62.0
## [73] hms_1.1.1 shinyjs_2.1.0 mime_0.12
## [76] evaluate_0.15 xtable_1.8-4 shinystan_2.6.0
## [79] pbkrtest_0.5.1 sjstats_0.18.1 readxl_1.4.0
## [82] ggeffects_1.1.2 rstantools_2.2.0 compiler_4.2.1
## [85] crayon_1.5.1 minqa_1.2.4 htmltools_0.5.3
## [88] mgcv_1.8-40 later_1.3.0 tzdb_0.3.0
## [91] RcppParallel_5.1.5 lubridate_1.8.0 DBI_1.1.3
## [94] sjlabelled_1.2.0 dbplyr_2.2.1 MASS_7.3-57
## [97] boot_1.3-28 cli_3.3.0 mitools_2.4
## [100] parallel_4.2.1 insight_0.18.0 igraph_1.3.4
## [103] pkgconfig_2.0.3 numDeriv_2016.8-1.1 binom_1.1-1.1
## [106] xml2_1.3.3 dygraphs_1.1.1.6 bslib_0.4.0
## [109] estimability_1.4 rvest_1.0.2 distributional_0.3.0
## [112] callr_3.7.1 digest_0.6.29 parameters_0.18.1
## [115] fastGHQuad_1.0.1 rmarkdown_2.14 cellranger_1.1.0
## [118] shiny_1.7.1 gtools_3.9.3 nloptr_2.0.3
## [121] lifecycle_1.0.1 nlme_3.1-158 jsonlite_1.8.0
## [124] fansi_1.0.3 pillar_1.8.0 loo_2.5.1
## [127] fastmap_1.1.0 httr_1.4.3 plotrix_3.8-2
## [130] DEoptimR_1.0-11 pkgbuild_1.3.1 survival_3.3-1
## [133] glue_1.6.2 xts_0.12.1 bayestestR_0.12.1
## [136] shinythemes_1.2.0 iterators_1.0.14 stringi_1.7.8
## [139] sass_0.4.2 performance_0.9.1
# Read data ======================================
col_types <- cols(
labid = col_factor(),
subid = col_factor(),
subid_unique = col_factor(),
CDI.form = col_factor(),
CDI.nwords = col_integer(),
CDI.prop = col_number(),
CDI.agerange = col_factor(),
CDI.agedays = col_integer(),
CDI.agemin = col_integer(),
CDI.agemax = col_integer(),
vocab_nwords = col_integer(),
standardized.score.CDI = col_character(),
standardized.score.CDI.num = col_number(),
IDS_pref = col_number(),
language = col_factor(),
language_zone = col_factor(),
CDI.error = col_logical(),
Notes = col_character(),
trial_order = col_factor(),
method = col_factor(),
age_days = col_integer(),
age_mo = col_number(),
age_group = col_factor(),
nae = col_logical(),
gender = col_factor(),
second_session = col_logical()
)
data.total <- read_csv("data/02b_processed.csv", col_types = col_types)
Before moving on with the analysis, we have to ready the data by (a)
checking for colinearity between z_age_months and
CDI.z_age_months and correcting this if necessary, and (b)
setting up the contrasts described in our data analysis.
First, we run a Kappa test on the possibility of colinearity between
z_age_months and CDI.z_age_months.
# Run kappa test
k.age_months <- model.matrix(~ z_age_months + CDI.z_age_months, data = data.total) %>%
kappa(exact = T)
With a value of 3.2587054, we do not have a colinearity issue and can proceed with the data analysis as planned (The criteria of indicating colinearity is that kappa > 10).
We need gender as an effect-coded factor, and
method as a deviation-coded factor. This is achieved in R
by using the contr.sum() function with the number of levels
for each factor. Notably, when subsetting the UK sample, only two levels
of method out of the three in total were left.
# Set contrasts on the total dataset =============
contrasts(data.total$gender) <- contr.sum(2)
contrasts(data.total$method) <- contr.sum(3)
# Create sub-datasets, with contrasts ============
## NAE
data.nae <- data.total %>%
subset(language_zone == "NAE") %>%
droplevels()
contrasts(data.nae$gender) <- contr.sum(2)
contrasts(data.nae$method) <- contr.sum(3)
## UK (combined-age and separate 18/24 months)
data.uk <- data.total %>%
subset(language_zone == "British") %>%
droplevels()
contrasts(data.uk$gender) <- contr.sum(2)
contrasts(data.uk$method) <- contr.sum(2) # note that UK sample has only 2 levels, so sum of zero contrasts set to 2 levels
data.uk.18 <- data.total %>%
subset(language_zone == "British" & CDI.agerange ==
"18") %>%
droplevels()
contrasts(data.uk.18$gender) <- contr.sum(2)
contrasts(data.uk.18$method) <- contr.sum(2) # note that UK sample has only 2 levels, so sum of zero contrasts set to 2 levels
data.uk.24 <- data.total %>%
subset(language_zone == "British" & CDI.agerange ==
"24") %>%
droplevels()
contrasts(data.uk.24$gender) <- contr.sum(2)
contrasts(data.uk.24$method) <- contr.sum(2) # note that UK sample has only 2 levels, so sum of zero contrasts set to 2 levels
## Other
data.other <- data.total %>%
subset(language_zone == "Other") %>%
droplevels()
contrasts(data.other$gender) <- contr.sum(2)
contrasts(data.other$method) <- contr.sum(3)
We first assess the amount of data we have overall per condition and their shape overall.
data.total %>%
group_by(language_zone, CDI.agerange, method, gender) %>%
summarise(N = n(), age = mean(CDI.agedays), sd = sd(CDI.agedays)) %>%
kable()
## `summarise()` has grouped output by 'language_zone', 'CDI.agerange', 'method'.
## You can override using the `.groups` argument.
| language_zone | CDI.agerange | method | gender | N | age | sd |
|---|---|---|---|---|---|---|
| British | 18 | singlescreen | F | 21 | 548.7619 | 9.065896 |
| British | 18 | singlescreen | M | 18 | 548.2778 | 9.448346 |
| British | 18 | eyetracking | F | 8 | 546.6250 | 7.558108 |
| British | 18 | eyetracking | M | 10 | 549.5000 | 8.934950 |
| British | 24 | singlescreen | F | 12 | 734.6667 | 9.345230 |
| British | 24 | singlescreen | M | 6 | 733.8333 | 4.400758 |
| British | 24 | eyetracking | F | 12 | 729.2500 | 9.333469 |
| British | 24 | eyetracking | M | 5 | 737.8000 | 8.228001 |
| Other | 18 | singlescreen | F | 10 | 542.6000 | 6.586181 |
| Other | 18 | singlescreen | M | 12 | 545.4167 | 5.567084 |
| Other | 18 | eyetracking | F | 18 | 548.1667 | 6.862087 |
| Other | 18 | eyetracking | M | 21 | 551.7143 | 6.341474 |
| Other | 18 | hpp | F | 32 | 545.2500 | 6.530326 |
| Other | 18 | hpp | M | 27 | 550.5556 | 7.890273 |
| Other | 24 | singlescreen | F | 8 | 729.0000 | 7.131419 |
| Other | 24 | singlescreen | M | 10 | 726.7000 | 4.372896 |
| Other | 24 | eyetracking | F | 26 | 730.5385 | 5.623030 |
| Other | 24 | eyetracking | M | 23 | 730.1304 | 5.504580 |
| Other | 24 | hpp | F | 31 | 729.0645 | 9.312842 |
| Other | 24 | hpp | M | 25 | 727.4800 | 7.665507 |
| NAE | 18 | singlescreen | F | 19 | 554.9474 | 20.780530 |
| NAE | 18 | singlescreen | M | 14 | 581.2143 | 24.925052 |
| NAE | 18 | eyetracking | F | 1 | 549.0000 | NA |
| NAE | 18 | hpp | F | 64 | 557.9375 | 22.801333 |
| NAE | 18 | hpp | M | 66 | 556.1667 | 22.762035 |
| NAE | 24 | singlescreen | F | 23 | 737.1739 | 26.604258 |
| NAE | 24 | singlescreen | M | 20 | 739.6000 | 21.352678 |
| NAE | 24 | eyetracking | F | 2 | 756.5000 | 34.648232 |
| NAE | 24 | eyetracking | M | 1 | 745.0000 | NA |
| NAE | 24 | hpp | F | 58 | 731.6897 | 28.149480 |
| NAE | 24 | hpp | M | 65 | 726.2769 | 27.133068 |
More detailed information about Descriptive Statistics
# number of lab
data.total %>%
select(labid, language_zone) %>%
unique() %>%
group_by(language_zone) %>%
count()
## # A tibble: 3 × 2
## # Groups: language_zone [3]
## language_zone n
## <fct> <int>
## 1 British 4
## 2 Other 8
## 3 NAE 9
data.total %>%
group_by(language_zone, CDI.agerange) %>%
summarize(N = n())
## `summarise()` has grouped output by 'language_zone'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## # Groups: language_zone [3]
## language_zone CDI.agerange N
## <fct> <fct> <int>
## 1 British 18 57
## 2 British 24 35
## 3 Other 18 120
## 4 Other 24 123
## 5 NAE 18 164
## 6 NAE 24 169
# age range in each age group and language zone
data.total %>%
select(subid, language_zone, CDI.agedays, CDI.agerange) %>%
unique() %>%
group_by(language_zone, CDI.agerange) %>%
summarize(
age_min = (min(CDI.agedays) / 365.25 * 12),
age_max = (max(CDI.agedays) / 365.25 * 12)
)
## `summarise()` has grouped output by 'language_zone'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 4
## # Groups: language_zone [3]
## language_zone CDI.agerange age_min age_max
## <fct> <fct> <dbl> <dbl>
## 1 British 18 17.6 18.5
## 2 British 24 23.5 24.5
## 3 Other 18 17.5 18.5
## 4 Other 24 23.5 24.5
## 5 NAE 18 16.1 20.0
## 6 NAE 24 22.1 25.9
We then assess the data per lab in terms of sample size and CDI score (vocabulary size, for consistency between language zones).
by_lab <- data.total %>%
group_by(labid, language_zone, CDI.agerange) %>%
mutate(tested = n_distinct(subid_unique)) %>%
select(labid, language_zone, CDI.agerange, tested, vocab_nwords) %>%
nest(scores = vocab_nwords) %>%
mutate(
model = map(scores, ~ lm(vocab_nwords ~ 1, data = .x)),
ci = map(model, confint)
) %>%
transmute(
tested = tested,
mean = map_dbl(model, ~ coefficients(.x)[[1]]),
ci_lower = map_dbl(ci, 1),
ci_upper = map_dbl(ci, 2)
) %>%
arrange(language_zone) %>%
rownames_to_column() %>%
ungroup()
# created a new column with the labs IDs as character so it can be sorted in alphabetical order
by_lab$labid_car <- as.character(by_lab$labid)
# relabel the CDI age factor column
levels(by_lab$CDI.agerange) <- c("18 months", "24 months")
# Making sure the factor columns have levels in the order I would like to graph them
by_lab$language_zone <- factor(by_lab$language_zone, levels = c("British", "NAE", "Other"))
# sorted the idcolum in the way I would it to show in the ggplot
labid_ord <- by_lab %>%
dplyr::arrange(language_zone, desc(labid_car)) %>%
ungroup() %>%
filter(CDI.agerange == "18 months") %>%
select(labid)
# Making sure the factor columns have levels in the order I would like to graph them
by_lab$labid <- factor(by_lab$labid, levels = labid_ord$labid)
## graph by language zone and lab Id in asc order
by_lab %>%
ggplot(
.,
aes(
x = labid,
y = mean, colour = language_zone,
ymin = ci_lower, ymax = ci_upper
)
) +
geom_linerange() +
geom_point(aes(size = tested)) +
coord_flip(ylim = c(0, 500)) +
xlab("Lab") +
ylab("Vocabulary size") +
scale_colour_brewer(
palette = "Dark2", name = "Language zone",
breaks = c("British", "NAE", "Other")
) +
scale_size_continuous(name = "N", breaks = function(x) c(min(x), mean(x), max(x))) +
facet_wrap(vars(CDI.agerange)) +
theme(
legend.position = "bottom",
strip.background = element_blank(),
strip.placement = "outside",
text = element_text(size = 22)
)
ggsave("plots/vocab-size_by-lab.png", width = 12, height = 8)
First, we want to assess quickly if there is a direct correlation between IDS preference and CDI score, computing a Pearson#’s product-moment correlation. We use standardized CDI scores for the North American sample, and raw scores for the British sample. Since CDI grows with age, we run the British sample separately for 18 and 24 months.
# Statistics =====================================
## North American Sample
test.pearson.nae <- cor.test(data.nae$IDS_pref,
data.nae$z_standardized_CDI,
alternative = "two.sided", method = "pearson"
)
test.pearson.nae
##
## Pearson's product-moment correlation
##
## data: data.nae$IDS_pref and data.nae$z_standardized_CDI
## t = -0.91847, df = 305, p-value = 0.3591
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.16349833 0.05977293
## sample estimates:
## cor
## -0.05251901
## UK Sample
test.pearson.uk.18 <- cor.test(data.uk.18$IDS_pref,
data.uk.18$z_vocab_nwords,
alternative = "two.sided", method = "pearson"
)
test.pearson.uk.18
##
## Pearson's product-moment correlation
##
## data: data.uk.18$IDS_pref and data.uk.18$z_vocab_nwords
## t = -0.026578, df = 55, p-value = 0.9789
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2639050 0.2572241
## sample estimates:
## cor
## -0.003583759
test.pearson.uk.24 <- cor.test(data.uk.24$IDS_pref,
data.uk.24$z_vocab_nwords,
alternative = "two.sided", method = "pearson"
)
test.pearson.uk.24
##
## Pearson's product-moment correlation
##
## data: data.uk.24$IDS_pref and data.uk.24$z_vocab_nwords
## t = 0.45131, df = 33, p-value = 0.6547
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2617574 0.4010989
## sample estimates:
## cor
## 0.07832108
Plots for correlation
## North American Sample
### Get correlation value for annotation
cor_text <- "paste(italic(R)^2, \" =\")"
cor_value <- round(test.pearson.nae$estimate, 3)
### Build plot
plot.pearson.nae <- data.nae %>%
ggplot(aes(
x = IDS_pref,
y = standardized.score.CDI.num
)) +
xlab("IDS preference") +
ylab("Standardized CDI score") +
geom_point() +
geom_smooth(method = lm) +
annotate("text",
x = -.9, y = 51, parse = T, size = 4,
label = paste(cor_text, cor_value, sep = "~")
)
## UK Sample
cor_value <- round(test.pearson.uk.18$estimate, 3)
plot.pearson.uk.18 <- data.uk.18 %>%
ggplot(aes(
x = IDS_pref,
y = vocab_nwords
)) +
xlab("IDS preference") +
ylab("Vocabulary size (in words)") +
geom_point() +
geom_smooth(method = lm) +
annotate("text",
x = .8, y = 150, parse = T, size = 4,
label = paste(cor_text, cor_value, sep = "~")
)
cor_value <- round(test.pearson.uk.24$estimate, 3)
plot.pearson.uk.24 <- data.uk.24 %>%
ggplot(aes(
x = IDS_pref,
y = vocab_nwords
)) +
xlab("IDS preference") +
ylab("Vocabulary size (in words)") +
geom_point() +
geom_smooth(method = lm) +
annotate("text",
x = .8, y = 150, parse = T, size = 4,
label = paste(cor_text, cor_value, sep = "~")
)
# Global plot
plot.pearson <- ggarrange(plot.pearson.nae, plot.pearson.uk.18, plot.pearson.uk.24, ncol = 3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 26 rows containing non-finite values (stat_smooth).
## Warning: Removed 26 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
plot.pearson
# TODO: These plots need to be cleaned up!
ggsave("plots/corr_plot.pdf", plot.pearson,
units = "mm", width = 180, height = 100, dpi = 1000
)
We see no obvious direct link between IDS prefernce and CDI score here. However, an effect might appear once we take into account various factors that might interact with IDS preference and/or CDI score. We can also first enhance these plots with information about the age group at which infants were tested (18- or 24-month-old) for the NAE sample, using vocabulary size to better compare the NAE and UK samples.
plot.age_group <- data.total %>%
subset(language_zone != "Other") %>%
droplevels() %>%
ggplot(aes(
x = IDS_pref,
y = vocab_nwords,
colour = CDI.agerange
)) +
facet_wrap(vars(language_zone),
labeller = as_labeller(c(
"British" = "UK samples",
"NAE" = "North Amercian samples"
))
) +
xlab("Standardized IDS prefence score") +
ylab("Vocabulary size (in words)") +
theme(legend.position = "top") +
geom_point() +
geom_smooth(method = lm) +
scale_colour_brewer(
palette = "Dark2", name = "Age group",
breaks = c("18", "24"),
labels = c("18mo", "24m")
)
ggsave("plots/scatter_age.pdf", plot.age_group,
units = "mm", width = 180, height = 100, dpi = 1000
)
## `geom_smooth()` using formula 'y ~ x'
plot.age_group
## `geom_smooth()` using formula 'y ~ x'
Here, we run a mixed-effects model including only theoretically motivated effects, as described in the pre-registration. We start with the full model bellow, simplifying the random effects structure until it converges.
# Run models =====================================
## NAE
data.nae$centered_IDS_pref <- scale(data.nae$IDS_pref, center = T, scale = F)
lmer.full.nae <- lmer(standardized.score.CDI.num ~ CDI.z_age_months + gender + z_age_months + method + centered_IDS_pref + centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months + (1 | labid) + (1 | subid_unique),
data = data.nae
)
summary(lmer.full.nae)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## standardized.score.CDI.num ~ CDI.z_age_months + gender + z_age_months +
## method + centered_IDS_pref + centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months +
## centered_IDS_pref:z_age_months + (1 | labid) + (1 | subid_unique)
## Data: data.nae
##
## REML criterion at convergence: 2795.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.19986 -0.50964 -0.00062 0.40872 2.40426
##
## Random effects:
## Groups Name Variance Std.Dev.
## subid_unique (Intercept) 554.4 23.55
## labid (Intercept) 34.8 5.90
## Residual 219.1 14.80
## Number of obs: 307, groups: subid_unique, 211; labid, 8
##
## Fixed effects:
## Estimate Std. Error df t value
## (Intercept) 31.3285 6.4558 53.8578 4.853
## CDI.z_age_months 1.1559 0.3412 118.3835 3.388
## gender1 -1.4068 1.8876 194.1693 -0.745
## z_age_months -0.1076 0.6285 70.2700 -0.171
## method1 6.4913 6.5640 127.7625 0.989
## method2 -14.9012 11.8935 179.1258 -1.253
## centered_IDS_pref -33.9167 24.7927 208.6284 -1.368
## method1:centered_IDS_pref 26.2532 25.4357 208.7560 1.032
## method2:centered_IDS_pref -56.6834 49.5010 207.8344 -1.145
## CDI.z_age_months:centered_IDS_pref 0.5553 1.0468 120.4990 0.530
## z_age_months:centered_IDS_pref 2.1822 2.0830 190.6377 1.048
## Pr(>|t|)
## (Intercept) 1.08e-05 ***
## CDI.z_age_months 0.000957 ***
## gender1 0.456997
## z_age_months 0.864526
## method1 0.324572
## method2 0.211878
## centered_IDS_pref 0.172780
## method1:centered_IDS_pref 0.303201
## method2:centered_IDS_pref 0.253486
## CDI.z_age_months:centered_IDS_pref 0.596748
## z_age_months:centered_IDS_pref 0.296136
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) CDI.z__ gendr1 z_g_mn methd1 methd2 c_IDS_ m1:_ID m2:_ID
## CDI.z_g_mnt -0.059
## gender1 -0.030 0.021
## z_age_mnths 0.036 -0.036 -0.019
## method1 -0.724 0.000 0.021 0.102
## method2 0.883 -0.029 -0.034 0.020 -0.881
## cntrd_IDS_p 0.357 0.006 0.038 -0.040 -0.347 0.384
## mthd1:_IDS_ -0.335 -0.027 -0.027 0.017 0.360 -0.383 -0.926
## mthd2:_IDS_ 0.349 0.020 0.061 -0.007 -0.360 0.390 0.965 -0.971
## CDI.__:_IDS -0.011 0.014 -0.019 -0.019 -0.028 0.008 -0.048 0.026 -0.045
## z_g_m:_IDS_ -0.041 0.025 0.102 0.046 -0.037 0.022 0.022 -0.087 0.132
## CDI.__:
## CDI.z_g_mnt
## gender1
## z_age_mnths
## method1
## method2
## cntrd_IDS_p
## mthd1:_IDS_
## mthd2:_IDS_
## CDI.__:_IDS
## z_g_m:_IDS_ -0.068
# robust_lmer.full.nae <- robustlmm::rlmer(standardized.score.CDI.num ~ CDI.z_age_months + gender +
# z_age_months + method + centered_IDS_pref +
# centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months +
# (1 | labid),
# data = data.nae)
#
#
# summary(robust_lmer.full.nae) #this model is used to see if we can meet some statistical assumption better but we decided to use the original model as the inferential statistical results are consistent
full.nae_pvalue <- anova(lmer.full.nae) %>%
as_tibble(rownames = "Parameter") # this gives us the Type III p values
# ==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref
# method
# z_age_months
# gender
# CDI.z_age_months
# ==========
# First, check linearity
# data.nae$resid <- residuals(lmer.full.nae)
#
# plot(data.nae$resid, data.nae$standardized.score.CDI)
# Second, check normality
plot_model(lmer.full.nae, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]$labid
## `geom_smooth()` using formula 'y ~ x'
##
##
## [[3]]
##
## [[4]]
## `geom_smooth()` using formula 'y ~ x'
# Third, check autocorrelation
re_run_lme.full.nae <- nlme::lme(standardized.score.CDI.num ~ CDI.z_age_months + gender +
z_age_months + method + centered_IDS_pref +
centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months,
random = ~ 1 | labid,
method = "REML",
data = data.nae, na.action = na.exclude
)
plot(nlme::ACF(re_run_lme.full.nae, resType = "normalized")) # there is no sign for autocorrelation
# Lastly, check multi-collinearity
car::vif(lmer.full.nae) # we do see a multicollineartiy for the IDS preference variable, even though we have centered the IDS preference score. It is probably related to the number the participating labs (as this is the group level that we are controlling) and how we entered interaction between IDS preference and other variables (that lack variability in the current sample). We need to keep IDS preference in the model as exploring the relationship between IDS preference and CDI score is the key research question in the paper.
## GVIF Df GVIF^(1/(2*Df))
## CDI.z_age_months 1.008664 1 1.004323
## gender 1.038941 1 1.019285
## z_age_months 1.098275 1 1.047986
## method 1.290505 2 1.065835
## centered_IDS_pref 19.218063 1 4.383841
## method:centered_IDS_pref 20.770091 2 2.134812
## CDI.z_age_months:centered_IDS_pref 1.014148 1 1.007049
## z_age_months:centered_IDS_pref 1.316334 1 1.147316
lmer.full.uk <- lmer(vocab_nwords ~ CDI.z_age_months + gender +
z_age_months + method + IDS_pref +
IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months +
# (1 | labid) +
(1 | subid_unique),
data = data.uk
)
summary(lmer.full.uk)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: vocab_nwords ~ CDI.z_age_months + gender + z_age_months + method +
## IDS_pref + IDS_pref:method + IDS_pref:CDI.z_age_months +
## IDS_pref:z_age_months + (1 | subid_unique)
## Data: data.uk
##
## REML criterion at convergence: 1001.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.61591 -0.38896 -0.02037 0.37307 1.93394
##
## Random effects:
## Groups Name Variance Std.Dev.
## subid_unique (Intercept) 4834 69.53
## Residual 2209 47.00
## Number of obs: 92, groups: subid_unique, 76
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 178.534 10.936 67.996 16.326 < 2e-16 ***
## CDI.z_age_months 32.357 2.428 30.360 13.326 3.19e-14 ***
## gender1 10.142 10.156 67.389 0.999 0.3215
## z_age_months -7.226 3.621 71.967 -1.996 0.0498 *
## method1 -8.451 13.139 66.946 -0.643 0.5223
## IDS_pref 25.907 27.203 72.778 0.952 0.3441
## method1:IDS_pref -25.499 29.465 72.479 -0.865 0.3897
## CDI.z_age_months:IDS_pref 10.598 7.059 34.815 1.501 0.1423
## z_age_months:IDS_pref -8.352 8.978 78.184 -0.930 0.3551
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) CDI.z__ gendr1 z_g_mn methd1 IDS_pr m1:IDS CDI.__:
## CDI.z_g_mnt 0.183
## gender1 -0.024 -0.091
## z_age_mnths -0.021 -0.210 -0.037
## method1 -0.312 -0.067 -0.165 0.544
## IDS_pref -0.336 -0.060 -0.068 0.042 0.184
## mthd1:IDS_p 0.167 0.111 0.056 -0.146 -0.306 -0.104
## CDI.__:IDS_ -0.095 -0.293 0.019 0.106 0.129 0.233 -0.138
## z_g_mn:IDS_ -0.005 0.110 -0.214 -0.387 -0.136 -0.187 0.425 -0.345
full.uk_pvalue <- anova(lmer.full.uk) %>%
as_tibble(rownames = "Parameter") # this gives us the Type III p values
# ==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref
# method
# z_age_months
# gender
# CDI.z_age_months
# (1| labid) please note the variance was very little and reported as zero in the results, we needed to remove this random effect
# First, check linearity. The plot looks linear
data.uk$resid <- residuals(lmer.full.uk)
plot(data.uk$resid, data.uk$vocab_nwords)
# Second, check normality
plot_model(lmer.full.uk, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'
##
##
## [[3]]
##
## [[4]]
## `geom_smooth()` using formula 'y ~ x'
# Third, check autocorrelation
re_run_lme.full.uk <- nlme::lme(vocab_nwords ~ CDI.z_age_months + gender +
z_age_months + method + IDS_pref +
IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months,
random = ~ 1 | labid,
method = "REML",
data = data.nae, na.action = na.exclude
)
plot(nlme::ACF(re_run_lme.full.uk, resType = "normalized")) # there is no sign for autocorrelation
# Lastly, check multi-collinearity
car::vif(lmer.full.uk) # no problem for multicollinearlity
## CDI.z_age_months gender z_age_months
## 1.187307 1.141095 1.913758
## method IDS_pref method:IDS_pref
## 1.762512 1.127320 1.445690
## CDI.z_age_months:IDS_pref z_age_months:IDS_pref
## 1.308464 1.899772
We now want to check the statistical power of significant effects,
and discard any models with significant effects that do not reach 80%
power. This however leads to too many warnings of singularity issues on
the model updates inherent to the simr power simulations,
hence we cannot obtain satisfactory power estimates as
pre-registered.
AST: Note that we don’t have any IV(s) that turned out to be significant in the Full NAE model. So we won’t run the power analysis check. For the UK full model, there are two statistically significant IV: CDI_age and gender. The post hoc power check suggested that we have high power in detecting the effect of CDI_age but not gender. Note that gender has a smaller effect size to begin with, so this may partially explain why we have less power in detecting it in the model. As there can be a number of different factors that determines the posthoc power, we decided not to remove gender in the model based on posthoc power analysis check.
check_pwr_uk_cdi_age <- simr::powerSim(lmer.full.uk, test = fixed("CDI.z_age_months", method = "z"), seed = 2, nsim = 1000, alpha = 0.05) # specify that Gender is the ixed effect that we are looking into
check_pwr_uk_cdi_age
check_pwr_uk_gender <- simr::powerSim(lmer.full.uk, test = fixed("gender", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # specify that Gender is the ixed effect that we are looking into
check_pwr_uk_gender
For this combined analysis, we first need to restrain the age range for the NAE sample (previously ±2 months, now ±0.5 months).
# Create dataset with British and NAE only
before_exclusion_participants <- data.total %>%
filter(language_zone == "NAE" | language_zone == "British") %>%
distinct(subid_unique) %>%
count()
before_exclusion_CDIs <- data.total %>%
filter(language_zone == "NAE" | language_zone == "British") %>%
count()
data.uk_nae <- data.total %>%
subset(language_zone %in% c("British", "NAE")) %>%
mutate(
CDI.agemin = ifelse(language_zone == "NAE",
CDI.agemin + round(.5 * 365.25 / 12),
CDI.agemin
),
CDI.agemax = ifelse(language_zone == "NAE",
CDI.agemax - round(.5 * 365.25 / 12),
CDI.agemax
)
) %>%
subset(!(CDI.agedays < CDI.agemin | CDI.agedays > CDI.agemax)) %>%
droplevels()
# Create contrasts for analysis
contrasts(data.uk_nae$gender) <- contr.sum(2)
contrasts(data.uk_nae$method) <- contr.sum(3)
contrasts(data.uk_nae$language_zone) <- contr.sum(2)
after_exclusion_participants <- data.uk_nae %>%
distinct(subid_unique) %>%
count()
after_exclusion_CDIs <- count(data.uk_nae)
We go from 304 to 290 total participants in the combined sample, meaning that 14 participants were excluded from the North American sample. In total, 28 rows of data were removed.
We can then run the planned combined analysis adding the main effect
and interactions of language_zone.
lmer.full.uk_nae <- lmer(CDI.prop ~ CDI.z_age_months + language_zone + gender +
z_age_months + method + IDS_pref + IDS_pref:language_zone +
IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months +
(1 | labid) + (1 | subid_unique),
data = data.uk_nae
)
summary(lmer.full.uk_nae)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: CDI.prop ~ CDI.z_age_months + language_zone + gender + z_age_months +
## method + IDS_pref + IDS_pref:language_zone + IDS_pref:method +
## IDS_pref:CDI.z_age_months + IDS_pref:z_age_months + (1 |
## labid) + (1 | subid_unique)
## Data: data.uk_nae
##
## REML criterion at convergence: -97.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.89103 -0.50634 -0.00619 0.44106 2.47651
##
## Random effects:
## Groups Name Variance Std.Dev.
## subid_unique (Intercept) 0.020831 0.14433
## labid (Intercept) 0.001502 0.03875
## Residual 0.020069 0.14166
## Number of obs: 397, groups: subid_unique, 290; labid, 13
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.322768 0.020166 13.299171 16.006 4.49e-10 ***
## CDI.z_age_months 0.051004 0.003075 185.198888 16.586 < 2e-16 ***
## language_zone1 0.092405 0.024873 11.320472 3.715 0.00325 **
## gender1 0.029054 0.011460 269.840429 2.535 0.01180 *
## z_age_months -0.003430 0.004184 160.120357 -0.820 0.41365
## method1 -0.005645 0.025656 19.894754 -0.220 0.82810
## method2 -0.005063 0.040255 18.277056 -0.126 0.90128
## IDS_pref 0.007971 0.040343 296.222646 0.198 0.84351
## language_zone1:IDS_pref 0.029449 0.058002 307.606650 0.508 0.61201
## method1:IDS_pref -0.057186 0.052785 306.712106 -1.083 0.27949
## method2:IDS_pref 0.043003 0.087955 311.318982 0.489 0.62524
## CDI.z_age_months:IDS_pref 0.009680 0.008311 184.394430 1.165 0.24565
## z_age_months:IDS_pref -0.004877 0.011250 274.134058 -0.434 0.66497
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 13 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
combined.full.uk_nae_pvalue <- anova(lmer.full.uk_nae) %>%
as_tibble(rownames = "Parameter") # this gives us the Type III p values
# ==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref:language_zone
# IDS_pref
# method
# z_age_months
# gender
# language_zone
# CDI.z_age_months
# ==========
# First, check linearity. The plot looks linear
data.uk_nae$resid <- residuals(lmer.full.uk_nae)
plot(data.uk_nae$resid, data.uk_nae$CDI.prop)
# Second, check normality
plot_model(lmer.full.uk_nae, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]$labid
## `geom_smooth()` using formula 'y ~ x'
##
##
## [[3]]
##
## [[4]]
## `geom_smooth()` using formula 'y ~ x'
# Third, check autocorrelation
re_run_lme.full.uk_nae <- nlme::lme(CDI.prop ~ CDI.z_age_months + language_zone + gender +
z_age_months + method + IDS_pref + IDS_pref:language_zone +
IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months,
random = ~ 1 | labid,
method = "REML",
data = data.uk_nae, na.action = na.exclude
)
plot(nlme::ACF(re_run_lme.full.uk_nae, resType = "normalized")) # there is no sign for autocorrelation
# Lastly, check multi-collinearity
car::vif(lmer.full.uk_nae) # no problem for multicollinearlity
## GVIF Df GVIF^(1/(2*Df))
## CDI.z_age_months 1.213821 1 1.101736
## language_zone 1.950765 1 1.396698
## gender 1.033569 1 1.016646
## z_age_months 1.362839 1 1.167407
## method 2.449977 2 1.251096
## IDS_pref 1.466093 1 1.210823
## language_zone:IDS_pref 3.213972 1 1.792755
## method:IDS_pref 3.870057 2 1.402585
## CDI.z_age_months:IDS_pref 1.244461 1 1.115554
## z_age_months:IDS_pref 1.459354 1 1.208037
We then compute \(p\)-values, but leave out power estimates for those \(p\)-values as above. Again, we have a lot of singular fit issues for the power checks and decided not to remove parameters based on posthoc power analysis.
check_pwr_combined_cdi_age <- simr::powerSim(lmer.full.uk_nae, test = fixed("CDI.z_age_months", method = "z"), seed = 2, nsim = 1000, alpha = 0.05) # specify that Gender is the ixed effect that we are looking into
check_pwr_combined_cdi_age
check_pwr_combined_lang_zone <- simr::powerSim(lmer.full.uk_nae, test = fixed("language_zone", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # specify that Gender is the ixed effect that we are looking into
check_pwr_combined_lang_zone
check_pwr_combined_gender <- simr::powerSim(lmer.full.uk_nae, test = fixed("gender", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # specify that Gender is the ixed effect that we are looking into
check_pwr_combined_gender